# Infer ASVs

## Infer with DADA2 in QIIME2

# Build the sample manifest and import the raw paired-end reads into QIIME2
src/03.create_manifest.sh
src/04.import_data.sh
# Remove the 341F/805R (V3-V4) primers with cutadapt
qiime cutadapt trim-paired --i-demultiplexed-sequences results/01.demux_import_data.qza --p-front-f CCTACGGGNGGCWGCAG  --p-front-r GACTACHVGGGTATCTAATCC --o-trimmed-sequences results/02.demux_remove_adapter.qza

# Per-sample read-quality summary of the primer-trimmed reads
qiime demux summarize --i-data results/02.demux_remove_adapter.qza --o-visualization results/02.demux_remove_adapter.qzv
# Denoise, version 1: truncate forward reads at 260 and reverse at 280
# NOTE(review): truncating reverse reads longer than forward (280 > 260) is
# unusual because reverse quality typically drops first -- confirm intended
qiime dada2 denoise-paired --i-demultiplexed-seqs results/02.demux_remove_adapter.qza --p-trunc-len-f 260 --p-trunc-len-r 280 --p-max-ee-f 4.0 --p-max-ee-r 4.0 --p-min-overlap 8 --p-pooling-method pseudo --o-representative-sequences results/v1_260-280/rep-seqs-v1-260-280.qza --o-table results/v1_260-280/feature-table_v1-260-280.qza --o-denoising-stats results/v1_260-280/denoising-stats_v1-260-280.qza --p-n-threads 60

# Export version-1 denoising stats to plain text
qiime tools export --input-path results/v1_260-280/denoising-stats_v1-260-280.qza --output-path results/v1_260-280/denoising-stats_v1-260-280 
# Denoise, version 2: truncate forward reads at 250 and reverse at 200
qiime dada2 denoise-paired --i-demultiplexed-seqs results/02.demux_remove_adapter.qza --p-trunc-len-f 250 --p-trunc-len-r 200 --p-max-ee-f 4.0 --p-max-ee-r 4.0 --p-min-overlap 8 --p-pooling-method pseudo --o-representative-sequences results/v2_250-200/rep-seqs-v2-250-200.qza --o-table results/v2_250-200/feature-table_v2-250-200.qza --o-denoising-stats results/v2_250-200/denoising-stats_v2-250-200.qza --p-n-threads 80

# Export version-2 denoising stats to plain text
qiime tools export --input-path results/v2_250-200/denoising-stats_v2-250-200.qza --output-path results/v2_250-200/denoising-stats_v2-250-200

Test with the data cleaned with Trim Galore; the import and adapter removal were repeated.

# Denoise the Trim Galore-cleaned reads with the version-2 parameters (250/200)
qiime dada2 denoise-paired --i-demultiplexed-seqs results/02.demux_remove_adapter_trim.qza --p-trunc-len-f 250 --p-trunc-len-r 200 --p-max-ee-f 4.0 --p-max-ee-r 4.0 --p-min-overlap 8 --p-pooling-method pseudo --o-representative-sequences results/v2_250-200/rep-seqs-v2-250-200-trim.qza --o-table results/v2_250-200/feature-table_v2-250-200-trim.qza --o-denoising-stats results/v2_250-200/denoising-stats_v2-250-200-trim.qza --p-n-threads 80

# Export the Trim Galore run's denoising stats to plain text
qiime tools export --input-path results/v2_250-200/denoising-stats_v2-250-200-trim.qza --output-path results/v2_250-200/denoising-stats_v2-250-200-trim

## Infer ASVs with DADA2 in R

library(dada2)
# Locate the cutadapt-trimmed fastq files
fastq_path <- "/axolote/diana/diana/macrogen_all/pisolithus/results/03.cutadapt/"

# Sort file names so forward (_1) and reverse (_2) lists stay paired by sample
Fs <- sort(list.files(fastq_path, pattern="_1.fastq"))
Rs <- sort(list.files(fastq_path, pattern="_2.fastq"))

# Extract sample names: everything before the "_1" suffix of the forward file
sampleNames <- sapply(strsplit(Fs, "_1"), `[`, 1)
sampleNames
# Prepend the full path so downstream calls are unambiguous
Fs <- file.path(fastq_path, Fs)
Rs <- file.path(fastq_path, Rs)

# Quality-profile plot for one sample pair (index 40: forward then reverse)
QCplot_1 <- plotQualityProfile(c(rbind(Fs[40],Rs[40])))
#pdf("results/plots/02.dada/01.QCplot_1.pdf")
QCplot_1
#dev.off()
# NOTE(review): QCplot_2 uses the same files (index 40) as QCplot_1 --
# confirm whether a different sample index was intended here.
QCplot_2 <- plotQualityProfile(c(rbind(Fs[40],Rs[40])))
## Filter

### explore trunclen versions

# Create directory for clean reads
# Create the directory that will hold the quality-filtered reads.
# dir.exists() is the idiomatic directory check (replaces file_test("-d", ...));
# recursive = TRUE also creates "results/04.Dada2" if it is missing.
filt_path <- file.path("results/04.Dada2" , "filter_reads") 
if (!dir.exists(filt_path)) {
  dir.create(filt_path, recursive = TRUE)
}
# Output paths for the filtered forward/reverse reads, one gzipped fastq per sample
filtFs <- file.path(filt_path, paste0(sampleNames, "_F_filt.fastq.gz"))
filtRs <- file.path(filt_path, paste0(sampleNames, "_R_filt.fastq.gz"))

Compare truncation-length versions

# Quality control: compare three truncation-length versions.
# NOTE: all three calls write to the SAME filtFs/filtRs paths, so the files
# left on disk come from whichever filterAndTrim() runs LAST.  V2 (250/200)
# is the version saved as final downstream (Dada_clean.tsv), so it now runs
# last; the original order left the V3 output on disk while reporting V2.
# V1 
out1 <- filterAndTrim(Fs, filtFs, Rs, filtRs,
                      truncLen=c(280,200),
                      maxN=0, maxEE=c(5,5), truncQ=2, rm.phix=TRUE,
                      compress=TRUE, multithread=TRUE) 
#V3: no forward truncation
out3 <- filterAndTrim(Fs, filtFs, Rs, filtRs,
                      truncLen=c(0,200),
                      maxN=0, maxEE=c(5,5), truncQ=2, rm.phix=TRUE,
                      compress=TRUE, multithread=TRUE) 
#V2 extra permissive -- the version used downstream, deliberately run last
out2 <- filterAndTrim(Fs, filtFs, Rs, filtRs,
                      truncLen=c(250,200),
                      maxN=0, maxEE=c(5,5), truncQ=2, rm.phix=TRUE,
                      compress=TRUE, multithread=TRUE) 

0.0.1 Compare versions

# Convert the filterAndTrim() result matrices (reads.in / reads.out) to data frames
v1 <- as.data.frame(out1)
v2 <- as.data.frame(out2)
v3 <- as.data.frame(out3)
# Compute, per sample, the percentage of reads surviving the filter
# (reads.out / reads.in * 100) and tag every row with the version label.
# Returns the input data frame with `percentage` and `version` columns added.
calculate_percentage <- function(df, group_name) {
  df[["percentage"]] <- (df[["reads.out"]] / df[["reads.in"]]) * 100
  df[["version"]] <- group_name
  df
}
# Compute the retained-read percentage for each filter version
out1_with_percentage <- calculate_percentage(v1, 'out1')
out2_with_percentage <- calculate_percentage(v2, 'out2')
out3_with_percentage <- calculate_percentage(v3, 'out3')
# Stack the three versions into one long data frame for plotting
combined_data <- rbind(out1_with_percentage, out2_with_percentage, out3_with_percentage)
# Compare in a Boxplot
library(ggplot2)

# Boxplot comparing the percentage of reads each filter version retains
filter_versions <- ggplot(combined_data, aes(x = version, y = percentage, fill = version)) +
          geom_boxplot() +
          labs(x = "Filter version", y = "Percentage of reads after filter") +
          theme_bw() +
          scale_fill_brewer(palette = "Set2")
pdf("results/plots/02.dada/02.Filter_versions.pdf")
filter_versions
dev.off()
## png 
##   2
filter_versions

# Save before/after read totals for the version kept downstream (V2).
# FIX: quote=FALSE instead of quote=F -- T/F are reassignable and non-idiomatic.
write.table(out2, file="results/04.Dada2/Dada_clean.tsv", quote=FALSE, sep="\t",col.names=NA) # Table with the totals before and after cleaning

0.1 Error model

# De-replicate identical reads to reduce redundancy before error learning
derepFs <- derepFastq(filtFs, verbose=TRUE)
derepRs <- derepFastq(filtRs, verbose=TRUE)

# Name the de-replicated objects by sample so downstream tables inherit them
names(derepFs) <- sampleNames
names(derepRs) <- sampleNames

# Learn the per-base error model for each read direction
errF <- learnErrors(derepFs, multithread=TRUE, verbose=TRUE)
errR <- learnErrors(derepRs, multithread=TRUE, verbose=TRUE)

0.2 ASVs inference

Pseudo-pooling: samples are denoised independently, then re-processed with ASVs detected in other samples used as priors. This recovers rare variants shared across samples at a fraction of the computational cost of full pooling.

# Infer ASVs per read direction with pseudo-pooling (matches the QIIME2 runs above)
dadaFs <- dada(derepFs, err=errF, multithread=TRUE, pool = "pseudo", verbose=TRUE)
dadaRs <- dada(derepRs, err=errR, multithread=TRUE, pool = "pseudo", verbose = TRUE)

0.2.1 Merge and remove chimeras

# Merge forward/reverse pairs (minOverlap = 8, matching --p-min-overlap above)
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, minOverlap = 8, verbose=TRUE)

# Create the sample x ASV sequence table 
seqtabAll <- makeSequenceTable(mergers)
# Length distribution of merged sequences (sanity check against amplicon size)
table(nchar(getSequences(seqtabAll)))

# Remove chimeras by consensus across samples
seqtab_nochim <- removeBimeraDenovo(seqtabAll, method="consensus", multithread=TRUE, verbose=TRUE)

0.2.2 Get feature table and representative sequences

# Create the feature table and representative sequences from the chimera-free table
PE.table_tsv_output <- seqtab_nochim
# Zero out sample-level singletons (cells with a single observation)
PE.table_tsv_output[PE.table_tsv_output==1]=0 # Don't consider those values that have a single observation per sample, make them 0 (sample singletons)
# Drop ASVs that remain singletons across the whole table
PE.table_tsv_output <- PE.table_tsv_output[,colSums(PE.table_tsv_output)>1] # filter singleton ASVs across the table

# Export representative sequences as fasta, named ASV_1..ASV_n
uniquesToFasta(PE.table_tsv_output, fout="results/04.Dada2/ASVs.fasta", ids=paste("ASV_",1:ncol(PE.table_tsv_output), sep=""))
nochim=PE.table_tsv_output
# FIX: the ASV-number -> representative-sequence map was written to
# ASV_to_seqs-nochim.tsv, which the count-table write below immediately
# overwrote; give the map its own file so both outputs survive.
write.table(cbind("ASVs"=1:nrow(t(PE.table_tsv_output)),"rep_seq"=rownames(t(PE.table_tsv_output))), file="results/04.Dada2/ASV_to_seqs-map.tsv", quote=FALSE, sep="\t",row.names=FALSE)

# Replace each representative sequence with an incremental ASV identifier
PE.table_tsv_output <- t(PE.table_tsv_output)
rownames(PE.table_tsv_output) <- paste0("ASV_",1:nrow(PE.table_tsv_output))

# Save the ASV count table (consumed later by the biom/QIIME2 import)
write.table(PE.table_tsv_output, file="results/04.Dada2/ASV_to_seqs-nochim.tsv", quote=FALSE, sep="\t",col.names=NA)

0.2.3 Get track summary per step

# Track read counts through every processing step for all samples
getN <- function(x) sum(getUniques(x)) # Where getUniques gets non-repeated sequences from a dada2 object or merger object (joined reads)
# NOTE(review): `out1` comes from the V1 filter run, but Dada_clean.tsv saves
# out2 as the final version and the filtered files on disk come from the LAST
# filterAndTrim() call -- confirm which run should feed this summary.
track <- cbind(out1, sapply(derepFs, getN), sapply(dadaFs, getN), sapply(dadaRs, getN), rowSums(seqtabAll), rowSums(nochim))
colnames(track) <- c("Raw", "Qual_filter", "Derep", "ASVs R1", "ASVs R2", "Merged", "nonchim")
rownames(track) <- sampleNames
write.table(track, "results/04.Dada2/Seqs_lost_in_ASVs_processing.tsv", col.names=NA, sep="\t")


# Create a quick assessment of sequences lost throughout the process;
# matplot() draws base graphics straight into the open pdf device
pdf("results/plots/02.dada/03.sequences_throughout_ASV_process.pdf")

# Percentage of reads remaining after each step, R1 view (drops column 5, "ASVs R2")
matplot(t(track[,-5]/track[,1]*100),type='l',xaxt='n', main="Sequences remaining after each step  - R1 (%)", xlab="Step", ylab=" Percentage of Sequences remaining")
axis(1,at=1:ncol(track[,-5]),labels=colnames(track[,-5]))
# R2 view (drops column 4, "ASVs R1")
matplot(t(track[,-4]/track[,1]*100),type='l',xaxt='n', main="Sequences remaining after each step  - R2 (%)", xlab="Step", ylab=" Percentage of Sequences remaining")
axis(1,at=1:ncol(track[,-4]),labels=colnames(track[,-4]))

dev.off()

## Final per-sample summary: percentage of raw reads surviving the whole pipeline
track2 <- data.frame(track)
track2$percentage_used <-(track2$nonchim / track2$Raw) * 100
track2
write.table(track2, "results/04.Dada2/Seqs_lost_in_ASVs_processing_percentage.tsv", col.names=NA, sep="\t")

# Checkpoint the whole R session
save.image(file = "Dada2.RData") 

0.3 Compare QIIME2 and R

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 
## /axolote/diana/.local/lib/python3.10/site-packages/matplotlib/projections/__init__.py:63: UserWarning: Unable to import Axes3D. This may be due to multiple versions of Matplotlib being installed (e.g. as a system package and as a pip package). As a result, the 3D projection is not available.
##   warnings.warn("Unable to import Axes3D. This may be due to multiple versions of "
import seaborn as sns
# Load the denoising stats from the three QIIME2 runs and the R/dada2 run.
# NOTE(review): v1 reads from a .qzv directory path while v2/v3 read from
# export directories -- confirm the v1 path is where stats.tsv actually lives.
v1 = pd.read_csv('results/01.qiime/v1_260-280/denoising-stats_v1-260-280.qzv/stats.tsv',sep = '\t')
v2 = pd.read_csv('results/01.qiime/v2_250-200/denoising-stats_v2-250-200/stats.tsv',sep = '\t')
v3 = pd.read_csv('results/01.qiime/v2_250-200/denoising-stats_v2-250-200-trim/stats.tsv',sep = '\t')
v4 = pd.read_csv('results/04.Dada2/Seqs_lost_in_ASVs_processing_percentage.tsv',sep = '\t')
def compare_table(sample_id, v1, v2, v3, column_names):
    """Assemble a comparison table: a 'sample-id' column plus one column per
    denoising version (v1, v2, v3), named after the entries of column_names."""
    table = pd.DataFrame()
    table['sample-id'] = sample_id
    for position, values in enumerate((v1, v2, v3)):
        table[column_names[position]] = values
    return table
    
column_names = ["v1","v2","v3"]
# Side-by-side table of "percentage of input non-chimeric" per QIIME2 run
denoising_compare_versions = compare_table(v1['sample-id'],v1['percentage of input non-chimeric'], v2['percentage of input non-chimeric'], v3['percentage of input non-chimeric'], column_names)

# Append the R/dada2 pipeline percentages as a fourth column
denoising_compare_versions['dadaR']=v4['percentage_used']
denoising_compare_versions
##      sample-id       v1       v2       v3      dadaR
## 0    #q2:types  numeric  numeric  numeric        NaN
## 1     DR1014-E    49.89    87.25     89.7  93.849904
## 2   DR1014-SAH    25.01    46.47    48.42  49.732411
## 3   DR1015-SAH    23.81    45.81    47.51  48.452470
## 4     DR1018-E    41.54    74.95     79.7  83.522255
## ..         ...      ...      ...      ...        ...
## 79       SAH-5     8.12    51.63    54.49  55.626792
## 80       SAH-6     7.32    54.69    58.04  59.153486
## 81       SAH-7     5.27    51.98    55.82  57.347706
## 82       SAH-8     5.29     47.8    50.69  52.059677
## 83       SAH-9     7.53    48.97    51.53  52.320221
## 
## [84 rows x 5 columns]
# Row 0 is the QIIME2 "#q2:types" type-annotation row, not a sample; drop it
denoising_compare_versions.drop(0, inplace=True)
denoising_compare_versions
##      sample-id     v1     v2     v3      dadaR
## 1     DR1014-E  49.89  87.25   89.7  93.849904
## 2   DR1014-SAH  25.01  46.47  48.42  49.732411
## 3   DR1015-SAH  23.81  45.81  47.51  48.452470
## 4     DR1018-E  41.54  74.95   79.7  83.522255
## 5   DR1018-SAH  23.54   44.9  46.54  47.467611
## ..         ...    ...    ...    ...        ...
## 79       SAH-5   8.12  51.63  54.49  55.626792
## 80       SAH-6   7.32  54.69  58.04  59.153486
## 81       SAH-7   5.27  51.98  55.82  57.347706
## 82       SAH-8   5.29   47.8  50.69  52.059677
## 83       SAH-9   7.53  48.97  51.53  52.320221
## 
## [83 rows x 5 columns]
def get_boxplot(tabla, title="Percentage of Effective Reads", xlabel="Denoising Versions", ylabel="Percentage of Effective Reads" + "\n" + "after clean, merge, and non-chimeric"):
    """Draw a seaborn boxplot of the per-version percentages in `tabla`,
    save it to results/plots/denoising_versions_boxplot.png, and show it.

    Returns None; prints a message and does nothing if `tabla` is empty or
    has fewer than two columns.
    """
    if tabla.empty or len(tabla.columns) < 2:
        print("the table does not have sufficient data to get boxplot.")
        return
    
    fig, ax = plt.subplots()
    # seaborn plots the table's columns as individual boxes
    sns.boxplot(data=tabla, ax=ax)
    
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    
    plt.savefig('results/plots/denoising_versions_boxplot.png')
    plt.show()
    
# FIX: `versions` was never defined at module level (it was only the loop
# variable inside compare_table), so this call raised NameError; plot the
# comparison table built above instead.
get_boxplot(denoising_compare_versions)

1 Get taxonomy and phylogeny

1.1 Train database

Train the Silva database taking V3-V4 amplicon to increase the taxonomic assignment resolution.

mkdir -p data/dbs

# Get the Silva reference database and check its integrity.
# FIX: download into data/dbs/ (the md5sum checks below expect the files
# there); plain wget saved into the current directory instead.
wget -P data/dbs/ https://data.qiime2.org/2023.5/common/silva-138-99-seqs.qza

md5sum data/dbs/silva-138-99-seqs.qza 
## expected: de8886bb2c059b1e8752255d271f3010  data/dbs/silva-138-99-seqs.qza

wget -P data/dbs/ https://data.qiime2.org/2023.5/common/silva-138-99-tax.qza

md5sum data/dbs/silva-138-99-tax.qza 
## expected: f12d5b78bf4b1519721fe52803581c3d  data/dbs/silva-138-99-tax.qza
conda activate qiime2-2023.5
# Extract the V3-V4 fragment amplified by the 341F/805R primers so the
# classifier is trained on the sequenced region only
qiime feature-classifier extract-reads \
--i-sequences data/dbs/silva-138-99-seqs.qza \
--p-f-primer CCTACGGGNGGCWGCAG --p-r-primer GACTACHVGGGTATCTAATCC \
--p-min-length 250 --p-max-length 450 \
--o-reads data/dbs/silva-138-99-seqs-extracted.qza --p-n-jobs 40
# Train the naive Bayes classifier on the extracted region
qiime feature-classifier fit-classifier-naive-bayes \
--i-reference-reads data/dbs/silva-138-99-seqs-extracted.qza \
--i-reference-taxonomy data/dbs/silva-138-99-tax.qza \
--o-classifier data/dbs/classifier_silva_138_trained.qza
#Taxonomy in QIIME2

1.2 Import data to QIIME2

# Import the R/dada2 outputs into QIIME2 artifacts
mkdir -p results/05.taxonomy_qiime/
qiime tools import --input-path results/04.Dada2/ASVs.fasta --type 'FeatureData[Sequence]' --output-path results/05.taxonomy_qiime/ASV_rep_seq.qza

# append missing header to the table for import
# NOTE: echo -n emits no trailing newline, so "#OTU Table" fuses with the
# tab-separated header line of the tsv -- the single header row biom expects
cat <(echo -n "#OTU Table") results/04.Dada2/ASV_to_seqs-nochim.tsv > results/05.taxonomy_qiime/temp.txt

# convert the tsv to HDF5 biom
biom convert -i results/05.taxonomy_qiime/temp.txt -o results/05.taxonomy_qiime/temp.biom --table-type="OTU table" --to-hdf5

# import the biom as a FeatureTable[Frequency] artifact
qiime tools import --input-path results/05.taxonomy_qiime/temp.biom --type 'FeatureTable[Frequency]' --input-format BIOMV210Format --output-path results/05.taxonomy_qiime/ASV_table.qza

1.3 Taxonomy assignment

# Classify each ASV against the region-trained Silva classifier
qiime feature-classifier classify-sklearn --i-classifier data/dbs/classifier_silva_138_trained.qza --i-reads results/05.taxonomy_qiime/ASV_rep_seq.qza --o-classification results/05.taxonomy_qiime/taxonomy.qza --p-n-jobs 80

# Interactive table of the per-ASV assignments
qiime metadata tabulate \
  --m-input-file results/05.taxonomy_qiime/taxonomy.qza \
  --o-visualization results/05.taxonomy_qiime/taxonomy.qzv

# Browsable fasta to compare assignments against top BLASTn hits for selected ASVs
qiime feature-table tabulate-seqs \
--i-data results/05.taxonomy_qiime/ASV_rep_seq.qza \
--o-visualization results/05.taxonomy_qiime/ASV_rep_seq.qzv

2 Filters

#Summary of the qza table imported from R
qiime feature-table summarize \
--i-table results/05.taxonomy_qiime/ASV_table.qza \
--o-visualization results/05.taxonomy_qiime/ASV_table_summary.qzv
# Drop non-bacterial and organellar ASVs; --p-include p__ keeps only ASVs
# classified at least to phylum level.
# FIX: the --p-exclude list must contain NO spaces -- the shell split
# "Eukarya, Eukaryota, mitochondria" into separate arguments, breaking the command.
qiime taxa filter-table --i-table results/05.taxonomy_qiime/ASV_table.qza --i-taxonomy results/05.taxonomy_qiime/taxonomy.qza --p-exclude Archaea,Eukarya,Eukaryota,mitochondria,chloroplast --p-include p__ --o-filtered-table results/05.taxonomy_qiime/ASV_table_filter_aemc.qza

qiime feature-table summarize --i-table results/05.taxonomy_qiime/ASV_table_filter_aemc.qza --o-visualization results/05.taxonomy_qiime/ASV_table_summaryfilter_aemc.qzv

Here I removed all ASVs with a frequency of less than 0.1% of the mean sample depth. This cut-off excludes ASVs that are likely due to MiSeq bleed-through between runs (reported by Illumina to be 0.1% of reads). To calculate this cut-off I identified the mean sample depth, multiplied it by 0.001, and rounded to the nearest integer. This step is described in the referenced paper.

# Keep ASVs with total frequency >= 62 (0.1% of mean sample depth) in >= 1 sample
qiime feature-table filter-features --i-table  results/05.taxonomy_qiime/ASV_table_filter_aemc.qza --p-min-samples 1 --p-min-frequency 62 --o-filtered-table results/05.taxonomy_qiime/ASV_table_filter_aemc_freq62_1minsamp.qza

qiime feature-table summarize --i-table results/05.taxonomy_qiime/ASV_table_filter_aemc_freq62_1minsamp.qza --o-visualization results/05.taxonomy_qiime/ASV_table_filter_aemc_freq62_1minsamp.qzv
# Keep only the representative sequences that survived the table filters
qiime feature-table filter-seqs  --i-table results/05.taxonomy_qiime/ASV_table_filter_aemc_freq62_1minsamp.qza --i-data results/05.taxonomy_qiime/ASV_rep_seq.qza --o-filtered-data results/05.taxonomy_qiime/ASV_rep_seq_filters.qza

#Exports to text format

#taxonomy
qiime tools export --input-path results/05.taxonomy_qiime/taxonomy.qza --output-path results/05.taxonomy_qiime/exports/taxonomy

#feature table
qiime tools export --input-path results/05.taxonomy_qiime/ASV_table_filter_aemc_freq62_1minsamp.qza --output-path results/05.taxonomy_qiime/exports/ASV_table

# Reformat the taxonomy tsv header so biom accepts it ("#Feature ID" / "taxonomy")
sed -i -e '1 s/Feature/#Feature/' -e '1 s/Taxon/taxonomy/' results/05.taxonomy_qiime/exports/taxonomy/taxonomy.tsv

# Add taxonomy to the feature table.
# FIX: --sc-separated expects the metadata COLUMN NAME(S) whose values are
# semicolon-separated (here "taxonomy"), not a file path.
biom add-metadata -i results/05.taxonomy_qiime/exports/ASV_table/feature-table.biom -o results/05.taxonomy_qiime/exports/feature-table_tax.biom --observation-metadata-fp results/05.taxonomy_qiime/exports/taxonomy/taxonomy.tsv --sc-separated taxonomy

#Convert to tsv from biom format
biom convert -i results/05.taxonomy_qiime/exports/feature-table_tax.biom -o results/05.taxonomy_qiime/exports/feature-table_tax.tsv --to-tsv --header-key taxonomy

2.1 Phylogeny

# Align (MAFFT), mask hypervariable columns, and build rooted/unrooted
# IQ-TREE phylogenies from the filtered representative sequences
qiime phylogeny align-to-tree-mafft-iqtree \
--p-n-threads auto --i-sequences results/05.taxonomy_qiime/ASV_rep_seq_filters.qza \
--o-alignment results/05.taxonomy_qiime/align.qza \
--o-masked-alignment results/05.taxonomy_qiime/masked-align-iqtree.qza \
--o-tree results/05.taxonomy_qiime/unrooted-tree-iqtree.qza \
--o-rooted-tree results/05.taxonomy_qiime/rooted-tree-iqtree.qza --verbose

3 Diversity

3.1 Explore data

Import data to R from qiime and check prevalence to filter data

# 01. Load the QIIME2 artifacts into a phyloseq object
# NOTE(review): qza_to_phyloseq() comes from the qiime2R package; no
# library(qiime2R) call is visible in this file -- confirm it is loaded upstream.
physeq_qiime2 <- qza_to_phyloseq(
  features = "results/05.taxonomy_qiime/ASV_table_filter_aemc_freq62_1minsamp.qza",
  tree = "results/05.taxonomy_qiime/rooted-tree-iqtree.qza",
  taxonomy = "results/05.taxonomy_qiime/taxonomy.qza",
  metadata = "data/metadata.tsv")
physeq_qiime2
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 9231 taxa and 83 samples ]
## sample_data() Sample Data:       [ 83 samples by 8 sample variables ]
## tax_table()   Taxonomy Table:    [ 9231 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 9231 tips and 9230 internal nodes ]
# 02. Explore prevalence
## 02.1 Prevalence = number of samples in which each taxon is observed.
## Orient apply() along the taxa axis whichever way the OTU table is stored.
taxa_margin <- if (taxa_are_rows(physeq_qiime2)) 1 else 2
prevdf <- apply(X = otu_table(physeq_qiime2),
                MARGIN = taxa_margin,
                FUN = function(counts) sum(counts > 0))
## 02.2 Attach total abundance and the taxonomy table to the prevalence vector
prevdf <- data.frame(Prevalence = prevdf,
                     TotalAbundance = taxa_sums(physeq_qiime2),
                     tax_table(physeq_qiime2))
## 02.3 Mean and total prevalence per Phylum
dfprev <- plyr::ddply(prevdf, "Phylum", function(df1) {
  cbind(mean(df1$Prevalence), sum(df1$Prevalence))
})

## 02.4 Genus-level prevalence vs total abundance, faceted by Phylum
prevalence_genus = subset(prevdf, Genus %in% get_taxa_unique(physeq_qiime2, "Genus"))
# Dashed line marks the 5% prevalence threshold used to eyeball rare taxa
prev_genus <- ggplot(prevalence_genus, aes(TotalAbundance, 
  Prevalence /nsamples(physeq_qiime2),color=Genus)) +
  geom_hline(yintercept = 0.05, alpha = 0.5, linetype = 2) +  geom_point(size = 2, alpha = 0.7) +
  scale_x_log10() +  xlab("Total Abundance") + ylab("Prevalence") +
  facet_wrap(~Phylum) + theme(legend.position="none")

# Save the prevalence plot (auto-printed inside the device, then on screen)
pdf("results/plots/03.diversity/01.Prevalence_Phylum_and_genus.pdf")
prev_genus
dev.off()
prev_genus

# Phyla flagged for removal (rare / low prevalence in this dataset).
# FIX: "Lastescibacterota" misspelled the Silva phylum "Latescibacterota",
# so that phylum was never actually removed by the filter.
filterPhyla = c("GAL15", "Latescibacterota")
physeq_filter_phyla = subset_taxa(physeq_qiime2, !Phylum %in% filterPhyla)

filterPhyla2 = c("Elusimicrobiota", "Cyanobacteria", "Dependentiae")

# Remove the second set of names wherever they occur, checking every rank
# (subset_taxa also drops taxa whose rank value is NA at the tested rank)
physeq_filter_phyla <- subset_taxa(physeq_filter_phyla, !Phylum %in% filterPhyla2)
physeq_filter_phyla <- subset_taxa(physeq_filter_phyla, !Class %in% filterPhyla2)
physeq_filter_phyla <- subset_taxa(physeq_filter_phyla, !Order %in% filterPhyla2)
physeq_filter_phyla <- subset_taxa(physeq_filter_phyla, !Family %in% filterPhyla2)
physeq_filter_phyla <- subset_taxa(physeq_filter_phyla, !Genus %in% filterPhyla2)
physeq_filter_phyla <- subset_taxa(physeq_filter_phyla, !Species %in% filterPhyla2)
physeq_filter_phyla
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 9215 taxa and 83 samples ]
## sample_data() Sample Data:       [ 83 samples by 8 sample variables ]
## tax_table()   Taxonomy Table:    [ 9215 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 9215 tips and 9214 internal nodes ]
## Genus prevalence plot after the phylum/rank filters above
prevalence_genus_filter = subset(prevdf, Genus %in% get_taxa_unique(physeq_filter_phyla, "Genus"))
prev_genus_filter <- ggplot(prevalence_genus_filter, aes(TotalAbundance, 
  Prevalence /nsamples(physeq_filter_phyla),color=Genus)) +
  geom_hline(yintercept = 0.05, alpha = 0.5, linetype = 2) +  geom_point(size = 2, alpha = 0.7) +
  scale_x_log10() +  xlab("Total Abundance") + ylab("Prevalence") +
  facet_wrap(~Phylum) + theme(legend.position="none")

# Save the filtered prevalence plot
pdf("results/plots/03.diversity/02.Prevalence_Phylum_and_genus_filter.pdf")
prev_genus_filter
dev.off()
prev_genus_filter

3.2 Number of samples

library(ggplot2)

# Count samples per Location/Source combination and re-plot as a dodged barplot
freq_samples <- plot_frequencies(sample_data(physeq_filter_phyla), "Location", "Source") + theme_bw()
freq_samples <- ggplot(freq_samples$data, aes(x = Groups, y = n, fill = Factor)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Number of samples of Source and Location") +
  theme_bw() + scale_color_brewer(palette = "Set2") + scale_fill_brewer(palette = "Set2")
# FIX: the plot must be print()ed while the pdf device is open; the original
# wrapped only the assignment in the device, leaving an empty pdf.
pdf("results/plots/03.diversity/Number_of_Samples.pdf")
print(freq_samples)
dev.off()
freq_samples

3.3 Accumulation curves

## 3.1 FASTER rarefaction curves with Vegan
mat <- as(t(otu_table(physeq_filter_phyla)), "matrix")
# Rarefy to the smallest sample depth
raremax <- min(rowSums(mat))
pdf("results/plots/03.diversity/03.Rarefaction_curves_vegan.pdf")
# rarecurve() draws base graphics as a side effect, so the open pdf device
# receives the plot; system.time() only records how long it took
vegan_rarefaction_curves <- system.time(rarecurve(mat, step = 100, sample = raremax, 
                                                  col = "blue", label = FALSE))
dev.off()
# Accumulation curves (ggplot-based) with ranacapa
library(ranacapa)
## 03.2 Get accumulation curves
acumulation_curves <- ggrare(physeq_filter_phyla, step = 100, color = "Source", label = "sample")
acumulation_curves
# Checkpoint the session (ggrare is slow)
save.image(file = "Dada2.RData") 
# NOTE(review): load() immediately after save.image() re-reads the same state
# and is a no-op here -- confirm it is only a session-resume convenience.
load("Dada2.RData")
#custom plot
# Accumulation curves faceted by Source
acumulation_curves_plot <- acumulation_curves + facet_wrap(~Source) +
  labs(title="Accumulative curves") + theme_bw() +
  scale_color_brewer(palette = "Set2") + scale_fill_brewer(palette = "Set2")
acumulation_curves_plot

# Save the Source-faceted plot (top-level expression auto-prints into the device)
pdf("results/plots/03.diversity/04.Rarefaction_curves_ranacapa_facewrapSource.pdf")
acumulation_curves_plot
dev.off()
# Accumulation curves faceted by Location
acumulation_curves_plot_loc <- acumulation_curves + facet_wrap(~Location) +
  labs(title="Accumulative curves") + theme_bw() + 
  scale_color_brewer(palette = "Set2") + scale_fill_brewer(palette = "Set2")
acumulation_curves_plot_loc

pdf("results/plots/03.diversity/04.Rarefaction_curves_ranacapa_facewrapLoc.pdf")
acumulation_curves_plot_loc
dev.off()

3.4 Alpha diversity

# Compute every alpha-diversity index supported by the microbiome package
diversity_all_index <- microbiome::alpha(physeq_filter_phyla, index = "all")
# Save the per-sample index table
write.table(diversity_all_index, "results/06.postprocess/all_alpha_index.tsv", col.names=NA, sep="\t")
location_colors= c("grey24","grey64","grey80")

# Observed richness and Shannon diversity by Location, boxes filled by Source
alpha_source_and_location <-plot_richness(physeq_filter_phyla, color = "Location", x = "Location", measures = c("Observed", "Shannon")) +
  geom_boxplot(aes(fill = Source), alpha=.7) + 
  scale_color_manual(values = location_colors) + scale_fill_brewer(palette = "Set2") + 
  theme_bw() +
  theme(axis.text.x  = element_text(size = 5, angle = 10))
  
# FIX: pass the plot explicitly -- without `plot =`, ggsave() saves
# last_plot() (whatever was printed most recently), not this object.
ggsave("results/plots/03.diversity/05.Alpha_diversity_source-location.pdf", plot = alpha_source_and_location, device = "pdf", dpi = 300, units = "px", width=3650, height=2035, bg = "white")
alpha_source_and_location

3.5 Beta diversity

3.5.1 Bray-Curtis

3.5.1.1 Stress

# NMDS ordination on Bray-Curtis dissimilarities
nmds_bray <- ordinate(physeq_filter_phyla, method = "NMDS", distance = "bray")
# Stress value (< 0.1 is generally considered a good 2-D representation)
var_stress_nmds_bray <- round(nmds_bray$stress, 5)
var_stress_nmds_bray
## [1] 0.09511
# Shepard plot: checks that ordination distances track the original dissimilarities
stressplot(nmds_bray)

3.5.1.2 ANOSIM

#### ANOSIM: test whether community composition differs between groups
# Extract the sample metadata as a plain data frame
metadata <- data.frame(phyloseq::sample_data(physeq_filter_phyla),
           check.names = FALSE)
# Significance of grouping by Source (Bray-Curtis, 9999 permutations)
anosim_source <- anosim(x= as.data.frame(t(otu_table(physeq_filter_phyla))),
       grouping = metadata$Source,
       permutations = 9999, distance = "bray")

# Significance of grouping by Location
anosim_location <- anosim(x= as.data.frame(t(otu_table(physeq_filter_phyla))),
       grouping = metadata$Location,
       permutations = 9999, distance = "bray")
anosim_source
## 
## Call:
## anosim(x = as.data.frame(t(otu_table(physeq_filter_phyla))),      grouping = metadata$Source, permutations = 9999, distance = "bray") 
## Dissimilarity: bray 
## 
## ANOSIM statistic R: 0.1951 
##       Significance: 0.0001 
## 
## Permutation: free
## Number of permutations: 9999
anosim_location
## 
## Call:
## anosim(x = as.data.frame(t(otu_table(physeq_filter_phyla))),      grouping = metadata$Location, permutations = 9999, distance = "bray") 
## Dissimilarity: bray 
## 
## ANOSIM statistic R: 0.8746 
##       Significance: 0.0001 
## 
## Permutation: free
## Number of permutations: 9999

3.5.1.3 NMDS

# NMDS scatter colored by Location, shaped by Source; title embeds the
# Location ANOSIM result computed above
nmds_bray_plot <- plot_ordination(physeq_filter_phyla, nmds_bray, label = "sample",
                           color = "Location", shape = "Source") + theme_bw() + 
  scale_fill_manual(values = location_colors) + scale_color_manual(values = location_colors)+
  labs(col = "Location") + labs(title="NMDS, Bray-Curtis distance\nANOSIM Location: R=0.8746, p=0.0001, Permutations=9999") +
  geom_point(size=3) + theme_bw() 

# Annotate the stress value in the lower-right corner
nmds_bray_plot <- nmds_bray_plot +
  annotate("text", x = Inf, y = -Inf, label = paste("Stress:", var_stress_nmds_bray),
           hjust = 3.1, vjust = -1.9, size = 3)
nmds_bray_plot

# Shepard fit plot to pdf (base graphics draw directly into the device)
pdf("results/plots/03.diversity/06.NMDS_Shepard_Bray_Fit.pdf")
stressplot(nmds_bray)
dev.off()

pdf("results/plots/03.diversity/07.NMDS_Bray_Location.pdf")
nmds_bray_plot
dev.off()
# Same ordination with the aesthetics swapped: color = Source, shape = Location
nmds_bray_plot_2 <- plot_ordination(physeq_filter_phyla, nmds_bray, label = "sample",
                           color = "Source", shape = "Location") + theme_bw() +
  scale_color_brewer(palette = "Set2")+ scale_fill_brewer(palette = "Set2")+
  labs(col = "Source") + labs(title="NMDS, Bray-Curtis distance\nANOSIM Location: R=0.8746, p=0.0001, Permutations=9999") +
  geom_point(size=3) + theme_bw() 

nmds_bray_plot_2 <- nmds_bray_plot_2 +
  annotate("text", x = Inf, y = -Inf, label = paste("Stress:", var_stress_nmds_bray),
           hjust = 3.1, vjust = -1.9, size = 3)
pdf("results/plots/03.diversity/08.NMDS_Bray_Location_2.pdf")
nmds_bray_plot_2
dev.off()
nmds_bray_plot_2

3.5.2 Weighted Unifrac

3.5.2.1 Stress

# Weighted UniFrac uses relative abundance and the phylogeny, and is less
# sensitive to sample size than unweighted UniFrac

nmds_wunifrac <- ordinate(physeq_filter_phyla, method = "NMDS", distance = "wunifrac")
# Stress value for the weighted-UniFrac ordination
var_stress_nmds_wu <- round(nmds_wunifrac$stress, 5)
var_stress_nmds_wu
## [1] 0.1118
stressplot(nmds_wunifrac)# checks that the fit is good

# Weighted UniFrac NMDS, colored and shaped by Location
nmds_wu <- plot_ordination(physeq_filter_phyla, nmds_wunifrac, label = "Sample",
                           color = "Location", shape = "Location") + theme_bw() + 
  scale_color_brewer(palette = "Set2")+ scale_fill_brewer(palette = "Set2")+
  labs(col = "Location") + labs(title="NMDS, Weighted UniFrac distance") +
  geom_point(size=3) + theme_bw() 

# Annotate the stress value in the lower-right corner
nmds_wu <- nmds_wu +
  annotate("text", x = Inf, y = -Inf, label = paste("Stress:", var_stress_nmds_wu),
           hjust = 1.1, vjust = -1.1, size = 4)


pdf("results/plots/03.diversity/07.NMDS_Shepard_WUniFrac_Fit.pdf")
stressplot(nmds_wunifrac)
dev.off()

pdf("results/plots/03.diversity/08.NMDS_WUniFrac_Location.pdf")
nmds_wu
dev.off()
print(nmds_wu)

3.6 Abundances

### 0.6  abundances
library(ampvis2)
# Build the ampvis2 input: OTU counts plus taxonomy in a single data frame
# extracted from the phyloseq object
otu_table_ampvis <- data.frame(OTU = rownames(phyloseq::otu_table(physeq_filter_phyla)@.Data),
                               phyloseq::otu_table(physeq_filter_phyla)@.Data,
                               phyloseq::tax_table(physeq_filter_phyla)@.Data,
                               check.names = FALSE
)
# Sample metadata
meta_data_ampvis <- data.frame(phyloseq::sample_data(physeq_filter_phyla),
                               check.names = FALSE
)
# Promote the row index to a SampleID column, as amp_load() expects

meta_data_ampvis <- meta_data_ampvis %>% rownames_to_column(var = "SampleID")
# ampvis object
av2 <- amp_load(otu_table_ampvis, meta_data_ampvis)
# Top-25 taxa abundance boxplot grouped by Location (normalised, log scale)
ampv_boxplot_abundance_location <- amp_boxplot(av2,
            group_by = "Location",
            sort_by = mean,
            plot_type = "boxplot",
            tax_show = 25,
            tax_add = "Phylum",
            normalise = TRUE,
            plot_log = TRUE) +
  scale_color_manual(values = location_colors) +
  scale_fill_manual(values = location_colors)
# FIX: print() the plot while the pdf device is open; the original only ran
# the assignment inside the device, leaving an empty pdf.
pdf("results/plots//ampv_boxplot_abundance_location.pdf")
print(ampv_boxplot_abundance_location)
dev.off()
ampv_boxplot_abundance_location
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 187 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Top-20 taxa abundance boxplot grouped by Source (normalised, log scale)
ampv_boxplot_abundance_source <- amp_boxplot(av2,
                                                group_by = "Source",
                                                sort_by = mean,
                                                plot_type = "boxplot",
                                                tax_show = 20,
                                                tax_add = "Phylum",
                                                normalise = TRUE,
                                                plot_log = TRUE) +
                                          scale_color_brewer(palette = "Set2")+ 
                                          scale_fill_brewer(palette = "Set2")
# FIX: print() inside the open device; the bare assignment left the pdf empty.
pdf("results/plots/ampv_boxplot_abundance_source.pdf")
print(ampv_boxplot_abundance_source)
dev.off()
ampv_boxplot_abundance_source
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 159 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Rank-abundance curves per Source group (log10 x axis)
rank_ab <- amp_rankabundance(av2, log10_x = TRUE, group_by = "Source") +
  scale_color_brewer(palette = "Set2") + 
  scale_fill_brewer(palette = "Set2")
# FIX: print() while the device is open; the assignment alone drew nothing
# into the pdf.
pdf("results/plots/03.diversity/rank_abundance_curves.pdf")
print(rank_ab)
dev.off()

rank_ab
## Warning: Removed 1182 rows containing missing values or values outside the scale range
## (`geom_line()`).


# Heatmap of the 30 most abundant genera, grouped by Source, faceted by Location
# NOTE(review): plot_legendbreaks = c(1, 5, 5) repeats 5 -- confirm the
# intended break values.
ampv_heatmap_abundances_genus <- amp_heatmap(av2,
            group_by = "Source",
            facet_by = "Location",
            plot_values = TRUE,
            tax_show = 30,
            tax_aggregate = "Genus",
            tax_add = "Family",
            plot_colorscale = "log10",
            plot_legendbreaks = c(1, 5, 5),
            color_vector = c("white","deepskyblue3","magenta3")
)
# FIX: print() inside the open device; the original assignment left the pdf empty.
pdf("results/plots/03.diversity/ampv_heatmap_abundances30_genus.pdf")
print(ampv_heatmap_abundances_genus)
dev.off()
ampv_heatmap_abundances_genus
## Warning in scale_fill_gradientn(colours = color.pal, trans = plot_colorscale, :
## log-10 transformation introduced infinite values.

# Heatmap of the 30 most abundant families, grouped by Source, faceted by Location
ampv_heatmap_abundances_family <- amp_heatmap(av2,
            group_by = "Source",
            facet_by = "Location",
            plot_values = TRUE,
            tax_show = 30,
            tax_aggregate = "Family",
            tax_add = "Phylum",
            plot_colorscale = "log10",
            plot_legendbreaks = c(1, 5, 5),
            color_vector = c("white","deepskyblue3","magenta3")
)
# FIX: print() inside the open device; the original assignment left the pdf empty.
pdf("results/plots/03.diversity/ampv_heatmap_abundances30_family.pdf")
print(ampv_heatmap_abundances_family)
dev.off()
ampv_heatmap_abundances_family
# Heatmap of the 35 most abundant orders, grouped by Source, faceted by Location
ampv_heatmap_abundances_order <- amp_heatmap(av2,
            group_by = "Source",
            facet_by = "Location",
            plot_values = TRUE,
            tax_show = 35,
            tax_aggregate = "Order",
            tax_add = "Phylum",
            plot_colorscale = "log10",
            plot_legendbreaks = c(1, 5, 5),
            color_vector = c("white","deepskyblue3","magenta3")
)
# FIX: print() inside the open device; the original assignment left the pdf empty.
pdf("results/plots/03.diversity/ampv_heatmap_abundances30_order.pdf")
print(ampv_heatmap_abundances_order)
dev.off()
ampv_heatmap_abundances_order
## Warning in scale_fill_gradientn(colours = color.pal, trans = plot_colorscale, :
## log-10 transformation introduced infinite values.